{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "#Example blast command\n",
    "blastn -task blastn -query results/ata/families.fasta  -subject ../data/tair10/TAIR10_chr_all.fas -outfmt \"6 qseqid sseqid qstart qend sstart send score length mismatch gaps gapopen nident pident evalue qlen slen qcovs\" > results/ata/blast_families_ata.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_blast = 'results/ata/blast_families_ata.csv'\n",
    "path_blast_filtered = 'results/ata/blast_families_ata.filtered.csv'\n",
    "params = {'min_len':50,'max_len':False,'min_distance':10,'max_q':1.2,'min_q':0.7,'min_pident':80,'min_qcov':80}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('initial:', 114931)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'results/ata/blast_families_ata.csv'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(path_blast, sep='\\t', header=None)\n",
    "df.columns = ['qseqid','sseqid','qstart','qend','sstart','send','score','length','mismatch','gaps','gapopen','nident','pident','evalue','qlen','slen','qcovs']\n",
    "print('initial:',len(df.index))\n",
    "initial = len(df.index)\n",
    "path_blast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min len: 114931\n"
     ]
    }
   ],
   "source": [
    "#filter by length\n",
    "if(params['min_len']):\n",
    "    df = df[df.qlen > params['min_len']]\n",
    "print('Min len: ' + str(len(df.index)))\n",
    "min_length = str(len(df.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max len: 114931\n"
     ]
    }
   ],
   "source": [
    "if(params['max_len']):\n",
    "    df = df[df.qlen < params['max_len']]\n",
    "print('Max len: ' + str(len(df.index)))    \n",
    "max_length = str(len(df.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('min treshold:', 11855)\n"
     ]
    }
   ],
   "source": [
    "#filter by query / subject length treshold\n",
    "df = df[((df.length / df.qlen) >= params['min_q'])]\n",
    "print('min treshold:',len(df.index))\n",
    "min_treshold = str(len(df.index))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('max treshold:', 11854)\n"
     ]
    }
   ],
   "source": [
    "df = df[((df.length / df.qlen) <= params['max_q'])]\n",
    "print('max treshold:',len(df.index))\n",
    "max_treshold = str(len(df.index))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min_pident: 5777\n"
     ]
    }
   ],
   "source": [
    "#filter by pident\n",
    "df = df[(df.pident >= params['min_pident'])]\n",
    "print('Min_pident: ' + str(len(df.index)))\n",
    "min_pident = str(len(df.index))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min qcov: 5776\n"
     ]
    }
   ],
   "source": [
    "#filter by qcov\n",
    "df = df[(df.qcovs >= params['min_qcov'])]\n",
    "print('Min qcov: ' + str(len(df.index)))\n",
    "min_qcov = str(len(df.index))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done\n"
     ]
    }
   ],
   "source": [
    "#order sstart and send\n",
    "df['new_sstart'] = df[['sstart','send']].min(axis=1)\n",
    "df['new_ssend'] = df[['sstart','send']].max(axis=1)\n",
    "df['sstart'] = df['new_sstart']\n",
    "df['send'] = df['new_ssend']\n",
    "df = df.drop('new_sstart',axis=1).drop('new_ssend',axis=1)\n",
    "df = df.sort_values(by=['sseqid','sstart', 'send'])\n",
    "df = df.reset_index(drop=True)\n",
    "# sep by chr\n",
    "dfs = {}\n",
    "for seq in df.sseqid.unique():\n",
    "    dfs[seq] = df[df.sseqid == seq]\n",
    "print('done')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "2\n",
      "3\n",
      "4\n",
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n",
      "10\n",
      "11\n"
     ]
    }
   ],
   "source": [
    "# filter overlapped \n",
    "rows = []\n",
    "discard = []\n",
    "total = len(df.index)\n",
    "count = 0\n",
    "curr = 0\n",
    "for index, row in df.iterrows():\n",
    "    count += 1\n",
    "    curr_new = int(count * 100 * 1.0 / (total * 1.0))\n",
    "    if curr_new != curr:\n",
    "        curr = curr_new\n",
    "        if curr_new % 1 == 0:\n",
    "            print(curr_new)\n",
    "    if index in discard:\n",
    "        continue\n",
    "    for k2, v2 in df.loc[index:,].iterrows():\n",
    "        if abs(v2.sstart - row.sstart) > params['min_distance']:\n",
    "            break\n",
    "        if abs(v2.sstart - row.sstart) <= params['min_distance'] and abs(v2.send - row.send) <= params['min_distance']:\n",
    "            discard.append(k2)\n",
    "    rows.append(row)\n",
    "print('done')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(rows)\n",
    "print('Non overlapped: ' + str(len(df.index)))\n",
    "non_overlapped = str(len(df.index))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(path_blast_filtered, index=None, sep='\\t')\n",
    "path_blast_filtered\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Initial: ' + str(initial))\n",
    "print('Min len: ' + str(min_length))\n",
    "print('Max len: ' + str(max_length))\n",
    "print('Min treshold: ' + str(min_treshold))\n",
    "print('Max treshold: ' + str(max_treshold))\n",
    "print('Min pident: ' + str(min_pident))\n",
    "print('Min qcov: ' + str(min_qcov))\n",
    "print('Non overlapped: ' + str(non_overlapped))\n",
    "print('Saved: ' + path_blast_filtered)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
